import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

"""
This script calculates the effect size (Cohen's d) of the change in effort,
dropout rating and usefulness beliefs before and after doing the persuasive activities.


@author: Jonathan van Oudheusden
@date: 2024-01-28

Required files: 
    - combinedCodes.xlsx
    with these respective columns renamed to:
    # state_1 = Self-efficacy
    # state_2 = Practical knowledge
    # state_3 = Awareness of positive outcomes
    # state_4 = Awareness of negative outcomes
    # state_7 = mindset that physical activity helps to quit smoking
    
Output files:
    - plot.png
    - effort.png
"""


# Read the coded data from the Excel workbook into a DataFrame
file_path = 'combinedCodes.xlsx'
df = pd.read_excel(file_path)

# Persuasive-activity clusters that take part in the analysis; these are
# matched against the values of the 'cluster_new_index' column.
# NOTE(review): the last entry is capitalised ("Mindset ...") while the column
# name in columns_to_analyze is lower-case ("mindset ...") - confirm that this
# spelling matches the values stored in 'cluster_new_index'.
relevant_clusters = [
    "Self-efficacy",
    "Practical knowledge",
    "Awareness of positive outcomes",
    "Awareness of negative outcomes",
    "Mindset that physical activity helps to quit smoking",
]

# Columns for which Cohen's d is computed
columns_to_analyze = [
    'dropout_response',
    'effort',
    'Self-efficacy',
    'Practical knowledge',
    'Awareness of positive outcomes',
    'Awareness of negative outcomes',
    'mindset that physical activity helps to quit smoking',
]


# Function to calculate Cohen's d
def calculate_cohens_d(group1, group2):
    """Calculate Cohen's d (pooled-SD effect size) between two groups.

    d = (mean(group1) - mean(group2)) / pooled_sd, where pooled_sd combines
    the sample variances (ddof=1) of both groups, weighted by their degrees
    of freedom.

    Missing values (NaN) are dropped from each group before anything is
    computed, so the group sizes used for pooling always match the values
    that actually enter the means and variances.

    Args:
        group1: Sequence of numbers (list, NumPy array, pandas Series, ...).
        group2: Sequence of numbers to compare against ``group1``.

    Returns:
        Cohen's d as a float. ``nan`` is returned when either group has fewer
        than two non-missing observations, because the sample variance is
        undefined in that case. When the pooled standard deviation is zero
        the result is ``nan`` (equal means) or ``+/-inf`` (different means).
    """
    values1 = np.asarray(group1, dtype=float).ravel()
    values2 = np.asarray(group2, dtype=float).ravel()
    values1 = values1[~np.isnan(values1)]
    values2 = values2[~np.isnan(values2)]

    n1, n2 = values1.size, values2.size
    # With fewer than two observations the ddof=1 variance is undefined;
    # return NaN explicitly instead of relying on NumPy runtime warnings.
    if n1 < 2 or n2 < 2:
        return float("nan")

    var1, var2 = values1.var(ddof=1), values2.var(ddof=1)
    pooled_sd = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))

    # A zero pooled SD legitimately yields nan/inf; silence the warning only.
    with np.errstate(divide="ignore", invalid="ignore"):
        return float((values1.mean() - values2.mean()) / pooled_sd)

def calcCohen(activity_cluster):
    """Compute Cohen's d of the before/after difference for each analysed column.

    A before/after pair consists of two consecutive rows of the module-level
    ``df`` that have the same ``rand_id`` and whose first row has a
    ``cluster_new_index`` contained in ``activity_cluster``. The first row of
    the pair supplies the "before" value and the following row the "after"
    value. Pairs in which either value is missing are skipped per column.
    NOTE(review): this presumes the rows of each participant are stored in
    chronological order - confirm against the data file.

    For every column in ``columns_to_analyze`` the effect size is
    ``calculate_cohens_d(after, before)``, so a positive d means the value
    increased after the activity. Previously the change scores were split
    into even- and odd-indexed halves and compared with each other, which
    compares two arbitrary subsets of the same changes and does not measure
    the before/after effect.
    NOTE(review): the pooled-SD formula treats the two measurements as
    independent groups; if a paired effect size (mean change / SD of change)
    is preferred, that is a different statistic.

    Args:
        activity_cluster: Collection of cluster names; only pairs whose
            first row belongs to one of them are used.

    Returns:
        dict mapping each column name to its Cohen's d (NaN when there is
        not enough data). The results are also printed.
    """
    # A row starts a pair when the next row belongs to the same participant
    # and the row's own activity is one of the requested clusters.
    same_participant = df['rand_id'].eq(df['rand_id'].shift(-1))
    in_cluster = df['cluster_new_index'].isin(activity_cluster)
    is_pair_start = (same_participant & in_cluster).to_numpy()

    # Calculate Cohen's d for each column
    cohens_d_results = {}
    for col in columns_to_analyze:
        before = df[col]
        after = df[col].shift(-1)  # value of the following row
        usable = is_pair_start & before.notna().to_numpy() & after.notna().to_numpy()
        cohens_d_results[col] = calculate_cohens_d(
            after[usable].to_numpy(), before[usable].to_numpy()
        )

    # Print the results
    for column, value in cohens_d_results.items():
        print(f"Cohen's d for {column}: {value}")

    return cohens_d_results

    
# One record per (activity, state) combination, used for the grouped bar chart
plot_data_list = []

# Effect sizes for every individual activity cluster
for activity in relevant_clusters:
    print(activity)
    per_activity = calcCohen([activity])
    plot_data_list.extend(
        {"Persuasive Activities": activity, "State": state_name, "Cohens_d": effect}
        for state_name, effect in per_activity.items()
    )

# Effect sizes over all relevant clusters together
overall_results = calcCohen(relevant_clusters)
plot_data_list.extend(
    {"Persuasive Activities": "Overall", "State": state_name, "Cohens_d": effect}
    for state_name, effect in overall_results.items()
)

# Long-format table consumed by seaborn
plotting_data = pd.DataFrame(plot_data_list)

# Grouped bar chart: one group per state, one bar per activity
fig, ax = plt.subplots(figsize=(12, 9))
sns.barplot(
    data=plotting_data,
    x="State",
    y="Cohens_d",
    hue="Persuasive Activities",
    ax=ax,
)
ax.set_title("Cohen's d Values per Activity")
ax.set_xlabel("Dropout Response, Effort and Usefulness Beliefs")
ax.set_ylabel("Standardized Mean Difference")
plt.xticks(rotation=15)
fig.tight_layout()

# Write the chart to disk
fig.savefig('plot.png')



# Calculate the average 'effort' for each cluster.
# groupby() returns its groups sorted by key (alphabetically here), while the
# tick labels below are assigned in the order of relevant_clusters. The means
# are therefore re-ordered to that same order; without this the labels would
# be attached to the wrong data points. Clusters absent from the data become
# NaN and show up as a gap in the line.
avg_effort_per_cluster = (
    df[df['cluster_new_index'].isin(relevant_clusters)]
    .groupby('cluster_new_index')['effort']
    .mean()
    .reindex(relevant_clusters)
)

# Plotting the average effort in a line graph
plt.figure(figsize=(10, 6))
avg_effort_per_cluster.plot(kind='line', marker='o')
plt.title('Average Effort per Activity')
plt.xlabel('Persuasive Activities')
plt.ylabel('Average Effort')
# Positions 0..n-1 now correspond one-to-one to relevant_clusters
plt.xticks(range(len(relevant_clusters)), relevant_clusters, rotation=45)
plt.grid(True)
plt.tight_layout()

plt.savefig('effort.png')
